home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
CU Amiga Super CD-ROM 15
/
CU Amiga Magazine's Super CD-ROM 15 (1997)(EMAP Images)(GB)[!][issue 1997-10].iso
/
CUCD
/
Graphics
/
Ghostscript
/
source
/
iutilasm.asm
< prev
next >
Wrap
Assembly Source File
|
1994-07-27
|
12KB
|
697 lines
; Copyright (C) 1989, 1992, 1993 Aladdin Enterprises. All rights reserved.
;
; This file is part of Aladdin Ghostscript.
;
; Aladdin Ghostscript is distributed with NO WARRANTY OF ANY KIND. No author
; or distributor accepts any responsibility for the consequences of using it,
; or for whether it serves any particular purpose or works at all, unless he
; or she says so in writing. Refer to the Aladdin Ghostscript Free Public
; License (the "License") for full details.
;
; Every copy of Aladdin Ghostscript must include a copy of the License,
; normally in a plain ASCII text file named PUBLIC. The License grants you
; the right to copy, modify and redistribute Aladdin Ghostscript, but only
; under certain conditions described in the License. Among other things, the
; License requires that the copyright notice and this notice be preserved on
; all copies.
; iutilasm.asm
; Assembly code for Ghostscript interpreter on MS-DOS systems
ifdef FOR80386
.286c
endif
utilasm_TEXT SEGMENT WORD PUBLIC 'CODE'
ASSUME CS:utilasm_TEXT
ifdef FOR80386
; Macro for 32-bit operand prefix.
OP32 macro
db 66h
endm
endif ; FOR80386
; Clear a register
clear macro reg
xor reg,reg
endm
ifdef FOR80386
; Replace the multiply and divide routines in the Turbo C library
; if we are running on an 80386.
; Macro to swap the halves of a 32-bit register.
; Unfortunately, masm won't allow a shift instruction with a count of 16,
; so we have to code it in hex.
swap macro regno
OP32
db 0c1h,0c0h+regno,16 ; rol regno,16
endm
regax equ 0
regcx equ 1
regdx equ 2
regbx equ 3
; Multiply (dx,ax) by (cx,bx) to (dx,ax).
PUBLIC LXMUL@
PUBLIC F_LXMUL@
F_LXMUL@ proc far
LXMUL@ proc far
swap regdx
mov dx,ax
swap regcx
mov cx,bx
OP32
db 0fh,0afh,0d1h ; imul dx,cx
OP32
mov ax,dx
swap regdx
ret
LXMUL@ endp
F_LXMUL@ endp
; Divide two stack operands, leave the result in (dx,ax).
ifdef DEBUG
setup32 macro
mov bx,sp
push bp
mov bp,sp
OP32
mov ax,ss:[bx+4] ; dividend
endm
ret32 macro n
mov sp,bp
pop bp
ret n
endm
else ; !DEBUG
setup32 macro
mov bx,sp
OP32
mov ax,ss:[bx+4] ; dividend
endm
ret32 macro n
ret n
endm
endif ; (!)DEBUG
PUBLIC LDIV@, LUDIV@, LMOD@, LUMOD@
PUBLIC F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@
F_LDIV@ proc far
LDIV@ proc far
setup32
OP32
cwd
OP32
idiv word ptr ss:[bx+8] ; divisor
OP32
mov dx,ax
swap regdx
ret32 8
LDIV@ endp
F_LDIV@ endp
F_LUDIV@ proc far
LUDIV@ proc far
setup32
OP32
xor dx,dx
OP32
div word ptr ss:[bx+8] ; divisor
OP32
mov dx,ax
swap regdx
ret32 8
LUDIV@ endp
F_LUDIV@ endp
F_LMOD@ proc far
LMOD@ proc far
setup32
OP32
cwd
OP32
idiv word ptr ss:[bx+8] ; divisor
OP32
mov ax,dx
swap regdx
ret32 8
LMOD@ endp
F_LMOD@ endp
F_LUMOD@ proc far
LUMOD@ proc far
setup32
OP32
xor dx,dx
OP32
div word ptr ss:[bx+8] ; divisor
OP32
mov ax,dx
swap regdx
ret32 8
LUMOD@ endp
F_LUMOD@ endp
else ; !FOR80386
; Replace the divide routines in the Turbo C library,
; which do the division one bit at a time (!).
PUBLIC LDIV@, LMOD@, LUDIV@, LUMOD@
PUBLIC F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@
; Negate a long on the stack.
negbp macro offset
neg word ptr [bp+offset+2] ; high part
neg word ptr [bp+offset] ; low part
sbb word ptr [bp+offset+2],0
endm
; Negate a long in (dx,ax).
negr macro
neg dx
neg ax
sbb dx,0
endm
; Divide two unsigned longs on the stack.
; Leave either the quotient or the remainder in (dx,ax).
; Operand offsets assume that bp (and only bp) has been pushed.
nlo equ 6
nhi equ 8
dlo equ 10
dhi equ 12
; We use an offset in bx to distinguish div from mod,
; and to indicate whether the result should be negated.
odiv equ 0
omod equ 2
odivneg equ 4
omodneg equ 6
F_LMOD@ proc far
LMOD@ proc far
push bp
mov bp,sp
mov bx,omod
; Take abs of denominator
cmp byte ptr [bp+dhi+1],bh ; bh = 0
jge modpd
negbp dlo
modpd: ; Negate mod if numerator < 0
cmp byte ptr [bp+nhi+1],bh ; bh = 0
jge udiv
mov bx,omodneg
negnum: negbp nlo
jmp udiv
LMOD@ endp
F_LMOD@ endp
F_LUMOD@ proc far
LUMOD@ proc far
mov bx,omod
jmp udpush
LUMOD@ endp
F_LUMOD@ endp
F_LDIV@ proc far
LDIV@ proc far
push bp
mov bp,sp
mov bx,odiv
; Negate quo if num^den < 0
mov ax,[bp+nhi]
xor ax,[bp+dhi]
jge divabs
mov bx,odivneg
divabs: ; Take abs of denominator
cmp byte ptr [bp+dhi+1],bh ; bh = 0
jge divpd
negbp dlo
divpd: ; Take abs of numerator
cmp byte ptr [bp+nhi+1],bh ; bh = 0
jge udiv
jmp negnum
LDIV@ endp
F_LDIV@ endp
F_LUDIV@ proc far
LUDIV@ proc far
mov bx,odiv
udpush: push bp
mov bp,sp
udiv: push bx ; odiv, omod, odivneg, omodneg
mov ax,[bp+nlo]
mov dx,[bp+nhi]
mov bx,[bp+dlo]
mov cx,[bp+dhi]
; Now we are dividing dx:ax by cx:bx.
; Check to see whether this is really a 32/16 division.
or cx,cx
jnz div2
; 32/16, check for 16- vs. 32-bit quotient
cmp dx,bx
jae div1
; 32/16 with 16-bit quotient, just do it.
div bx ; ax = quo, dx = rem
pop bx
pop bp
jmp cs:xx1[bx]
even
xx1 dw offset divx1
dw offset modx1
dw offset divx1neg
dw offset modx1neg
modx1: mov ax,dx
divx1: xor dx,dx
ret 8
modx1neg: mov ax,dx
divx1neg: xor dx,dx
rneg: negr
ret 8
; 32/16 with 32-bit quotient, do in 2 parts.
div1: mov cx,ax ; save lo num
mov ax,dx
xor dx,dx
div bx ; ax = hi quo
xchg cx,ax ; save hi quo, get lo num
div bx ; ax = lo quo, dx = rem
pop bx
pop bp
jmp cs:xx1a[bx]
even
xx1a dw offset divx1a
dw offset modx1
dw offset divx1aneg
dw offset modx1neg
divx1a: mov dx,cx ; hi quo
ret 8
divx1aneg: mov dx,cx
jmp rneg
; This is really a 32/32 bit division.
; (Note that the quotient cannot exceed 16 bits.)
; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2
; (first edition).
; Start by normalizing the numerator and denominator.
div2: or ch,ch
jz div21 ; ch == 0, but cl != 0
; Do 8 steps all at once.
mov bl,bh
mov bh,cl
mov cl,ch
xor ch,ch
mov al,ah
mov ah,dl
mov dl,dh
xor dh,dh
rol bx,1 ; faster than jmp
div2a: rcr bx,1 ; finish previous shift
div21: shr dx,1
rcr ax,1
shr cx,1
jnz div2a
rcr bx,1
; Now we can do a 32/16 divide.
div2x: div bx ; ax = quo, dx = rem
; Multiply by the denominator, and correct the result.
mov cx,ax ; save quotient
mul word ptr [bp+dhi]
mov bx,ax ; save lo part of hi product
mov ax,cx
mul word ptr [bp+dlo]
add dx,bx
; Now cx = trial quotient, (dx,ax) = cx * denominator.
not dx
neg ax
cmc
adc dx,0 ; double-precision neg
jc divz ; zero quotient
; requires special handling
add ax,[bp+nlo]
adc dx,[bp+nhi]
jc divx
; Quotient is too large, adjust it.
div3: dec cx
add ax,[bp+dlo]
adc dx,[bp+dhi]
jnc div3
; All done. (dx,ax) = remainder, cx = lo quotient.
divx: pop bx
pop bp
jmp cs:xx3[bx]
even
xx3 dw offset divx3
dw offset modx3
dw offset divx3neg
dw offset modx3neg
divx3: mov ax,cx
xor dx,dx
modx3: ret 8
divx3neg: mov ax,cx
xor dx,dx
modx3neg: jmp rneg
; Handle zero quotient specially.
divz: pop bx
jmp cs:xxz[bx]
even
xxz dw offset divxz
dw offset modxz
dw offset divxz
dw offset modxzneg
divxz: pop bp
ret 8
modxzneg: negbp nlo
modxz: mov ax,[bp+nlo]
mov dx,[bp+nhi]
pop bp
ret 8
LUDIV@ endp
F_LUDIV@ endp
endif ; FOR80386
ifdef NOFPU
; See gsmisc.c for the C version of this code.
; /*
; * Floating multiply with fixed result, for avoiding floating point in
; * common coordinate transformations. Assumes IEEE representation,
; * 16-bit short, 32-bit long. Optimized for the case where the first
; * operand has no more than 16 mantissa bits, e.g., where it is a user space
; * coordinate (which are often integers).
; *
; * The assembly language version of this code is actually faster than
; * the FPU, if the code is compiled with FPU_TYPE=0 (which requires taking
; * a trap on every FPU operation). If there is no FPU, the assembly
; * language version of this code is over 10 times as fast as the
; * emulated FPU.
; */
; fixed
; fmul2fixed_(long /*float*/ a, long /*float*/ b)
; {
PUBLIC _fmul2fixed_
_fmul2fixed_ proc far
push bp
mov bp,sp
a equ 6
alo equ a
ahi equ a+2
b equ 10
blo equ b
bhi equ b+2
push si ; will hold ma
push di ; will hold mb
; int e = 260 + _fixed_shift - ((
; (((uint)(a >> 16)) & 0x7f80) + (((uint)(b >> 16)) & 0x7f80)
; ) >> 7);
mov dx,[bp+ahi]
; dfmul2fixed enters here
fmf: mov cx,260+12
mov ax,[bp+bhi]
and ax,7f80h
and dx,7f80h
add ax,dx
xchg ah,al ; ror ax,7 without using cl
rol ax,1
sub cx,ax
push cx ; e
; ulong ma = (ushort)(a >> 8) | 0x8000;
; ulong mb = (ushort)(b >> 8) | 0x8000;
mov si,[bp+alo+1] ; unaligned
clear ax
mov di,[bp+blo+1] ; unaligned
or si,8000h
or di,8000h
; ulong p1 = ma * (b & 0xff);
mov al,[bp+blo]
mul si
; (Do this later:)
; ulong p = ma * mb;
; if ( (byte)a ) /* >16 mantissa bits */
cmp byte ptr [bp+alo],0
je mshort
; { ulong p2 = (a & 0xff) * mb;
; p += ((((uint)(byte)a * (uint)(byte)b) >> 8) + p1 + p2) >> 8;
mov cx,dx
mov bx,ax
clear ax
mov al,[bp+alo]
clear dx
mov dl,[bp+blo]
mul dx
mov dl,ah ; dx is zero
add bx,cx
adc cx,0
clear ax
mov al,[bp+alo]
mul di
add ax,bx
adc dx,cx
; }
mshort:
; else
; p += p1 >> 8;
mov bl,ah ; set (cx,bx) = (dx,ax) >> 8
mov bh,dl
clear cx
mov cl,dh
mov ax,si
mul di
add ax,bx
adc dx,cx
; if ( (uint)e < 32 ) /* e = -1 is possible */
pop cx ; e
cmp cx,16
jb shr1
; else if ( e >= 32 ) /* also detects a=0 or b=0 */
cmp cx,0
jl eneg
sub cx,16
cmp cx,16
jge shr0
mov ax,dx
clear dx
shr ax,cl
jmp ex
; return fixed_0;
shr0: clear ax
clear dx
jmp ex
; else
; p <<= -e;
even
eneg: neg cx
shl dx,cl
mov bx,ax
shl ax,cl
rol bx,cl
xor bx,ax
add dx,bx
jmp ex
; p >>= e;
even
shr1: shr ax,cl
mov bx,dx
shr dx,cl
ror bx,cl
xor bx,dx
add ax,bx
ex:
; return ((a ^ b) < 0 ? -p : p);
mov cx,[bp+ahi]
xor cx,[bp+bhi]
jge pos
neg dx
neg ax
sbb dx,0
pos:
; }
retu: pop di
pop si
mov sp,bp
pop bp
ret
_fmul2fixed_ ENDP
; The same routine with the first argument a double rather than a float.
; The argument is split into two pieces to reduce data movement.
PUBLIC _dfmul2fixed_
_dfmul2fixed_ proc far
push bp
mov bp,sp
xalo equ 6
;b equ 10
xahi equ 14
push si ; overlap this below
push di ; ditto
; Shuffle the arguments and then use fmul2fixed.
; Squeeze 3 exponent bits out of the top 35 bits of a.
mov dx,[bp+xahi+2]
mov bx,0c000h
mov ax,[bp+xahi]
and bx,dx
mov cx,[bp+xalo+2]
and dx,7ffh ; get rid of discarded bits
add cx,cx ; faster than shl!
jz cz ; detect common case
adc ax,ax ; faster than rcl!
adc dx,dx
add cx,cx
adc ax,ax
adc dx,dx
add cx,cx
adc ax,ax
mov [bp+alo],ax
adc dx,dx
or dx,bx
mov [bp+ahi],dx
jmp fmf
even
cz: adc ax,ax
adc dx,dx
add ax,ax
adc dx,dx
add ax,ax
mov [bp+alo],ax
adc dx,dx
or dx,bx
mov [bp+ahi],dx
jmp fmf
_dfmul2fixed_ ENDP
endif ; NOFPU
; Transpose an 8x8 bit matrix. See gsmisc.c for the algorithm in C.
PUBLIC _memflip8x8
_memflip8x8 proc far
push ds
push si
push di
; After pushing, the offsets of the parameters are:
; byte *inp=10, int line_size=14, byte *outp=16, int dist=20.
mov si,sp
mov di,ss:[si+14] ; line_size
lds si,ss:[si+10] ; inp
; We assign variables to registers as follows:
; ax = AE, bx = BF, cx (or di) = CG, dx = DH.
; Load the input data. Initially we assign
; ax = AB, bx = EF, cx (or di) = CD, dx = GH.
mov ah,[si]
iload macro reg
add si,di
mov reg,[si]
endm
iload al
iload ch
iload cl
iload bh
iload bl
iload dh
iload dl
; Transposition macro, see C code for explanation.
trans macro reg1,reg2,shift,mask
mov si,reg1
shr si,shift
xor si,reg2
and si,mask
xor reg2,si
shl si,shift
xor reg1,si
endm
; Do 4x4 transpositions
mov di,cx ; we need cl for the shift count
mov cl,4
trans bx,ax,cl,0f0fh
trans dx,di,cl,0f0fh
; Swap B/E, D/G
xchg al,bh
mov cx,di
xchg cl,dh
; Do 2x2 transpositions
mov di,cx ; need cl again
mov cl,2
trans di,ax,cl,3333h
trans dx,bx,cl,3333h
mov cx,di ; done shifting >1
; Do 1x1 transpositions
trans bx,ax,1,5555h
trans dx,cx,1,5555h
; Store result
mov si,sp
mov di,ss:[si+20] ; dist
lds si,ss:[si+16] ; outp
mov [si],ah
istore macro reg
add si,di
mov [si],reg
endm
istore bh
istore ch
istore dh
istore al
istore bl
istore cl
istore dl
; All done
pop di
pop si
pop ds
ret
_memflip8x8 ENDP
utilasm_TEXT ENDS
END